# encoding: utf-8
"""
sft数据 医疗问答数据
{"instruction": "胰岛素强化治疗的推荐药有些什么？", "input": "", "output": "预混胰岛素类似物"}
"""

import json

# Source JSONL file; the loop below reads it one line at a time.
file_path = "G:/datas/medical_zh/train_zh_0.jsonl"

# Upper cut-off (1-based line number) for the copy loop below.
# NOTE(review): the original comment here read "200k", but the value is
# 210000 -- confirm which one is intended.
train_size = 210000

# 1-based number of the most recently read input line (0 before any read).
start = 0

# Copy a slice of the source file into the evaluation file.
# Lines numbered below 200000 are skipped. Every later line is written, and
# the loop stops right after writing the first line whose number exceeds
# train_size, so lines 200000..train_size + 1 (inclusive) are copied when the
# input is long enough.
# NOTE(review): the inclusive end (train_size + 1) may be an off-by-one --
# confirm the intended evaluation-set size.
with open(file_path, "r", encoding="utf-8") as f, open(
    "./datas/medical_sft_eval.json", "w", encoding="utf-8"
) as mf:
    for start, line in enumerate(f, start=1):
        if start >= 200000:
            mf.write(line)
            # Checked after the write, so the boundary line is still copied.
            if start > train_size:
                break
